library(sentimentr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Sample sentences passed to `sentiment_by()` further down.
mytext <- c(
  "do you like it? But I hate really bad dogs",
  "I am the best friend.",
  "Do you really like it? I'm not a fan",
  "It's like a tree.",
  "Microsoft is an amazing girl. It is doing really well and has made us all proud.",
  "Facebook is such a scam. It should be wiped out from the face of this earth.",
  "The Federal Reserve granted the company’s request to change its status, giving it access to low-cost financing."
)
# Confirm the samples are held as a plain character vector.
typeof(mytext)
[1] "character"
## Score the sample text straight from the raw character vector. This works,
## but sentence boundary disambiguation is then redone on every
## `sentiment_by` call; splitting once with `get_sentences` is the preferred
## method. For a batch this small the extra cost is minimal.
sentiment_by(text.var = mytext)
# Load the news headlines and preview the first rows.
news_data <- read.csv("company_data.csv")
head(news_data)
# Score every headline. The text is split into sentences with `get_sentences`
# first, as sentimentr recommends, so `sentiment_by` does not have to run
# sentence boundary disambiguation on a raw character vector (and no longer
# warns about it). `as.character` covers `headline` having been read as a
# factor.
news_data <- mutate(
  news_data,
  news_data_new = sentiment_by(get_sentences(as.character(headline)))$ave_sentiment
)
Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
raw `character` vector is passed to `text.var`. This may be costly of time and
memory. It is highly recommended that the user first runs the raw `character`
vector through the `get_sentences` function.
# Show the news data, which now carries the `news_data_new` sentiment column.
news_data
# Kernel density estimate of the per-headline sentiment scores.
d <- density(news_data$news_data_new)
# Draw the density curve, then fill it in red with a blue outline.
plot(d, main="Dist")
polygon(d, col="red", border="blue")

# Load the stock price data and preview the first rows.
stock_numbers <- read.csv("combined_dataframe_djia.csv")
head(stock_numbers)
# Keep the columns used downstream and add the midpoint of High and Low.
stock_numbers_ave <- stock_numbers %>%
  mutate(average_price = (High + Low) / 2) %>%
  select(Date, company_name, High, Low, Volume, average_price)
stock_numbers_ave
# Add `target`: the next row's `average_price` within each company (NA on a
# company's last row). The final `select()` of this script asks for `target`,
# and no other step creates it.
# NOTE(review): assumes rows are already in date order within each company --
# confirm against the CSV.
stock_numbers_ave <- stock_numbers_ave %>%
  group_by(company_name) %>%
  mutate(target = dplyr::lead(average_price, n = 1, default = NA)) %>%
  ungroup()
news_data
# Company names as they are spelled in the news data.
levels(news_data[["company_name"]])
[1] " The Travelers Companies" "3M" "American Express" "Apple" "Boeing"
[6] "Caterpillar Inc." "Chevron Corporation" "Cisco" "Coca-Cola" "Dow"
[11] "ExxonMobil" "Goldman Sachs" "IBM" "Intel" "Johnson & Johnson"
[16] "JPMorgan Chase" "McDonald's" "Merck & co" "Microsoft" "Nike"
[21] "Pfizer" "Procter & Gamble" "The Home Depot" "The Walt Disney Company" "United Health Group"
[26] "United Technologies" "Verizon" "Visa Inc." "Walgreens Boots Alliance" "Walmart"
# Values of `company_name` in the stock data, for comparison with the news data.
levels(stock_numbers_ave[["company_name"]])
[1] "AAPL" "AXP" "BA" "CAT" "CSCO" "CVX" "DD" "DIS" "GS" "HD" "IBM" "INTC" "JNJ" "JPM" "KO" "MCD" "MMM" "MRK" "MSFT" "NKE" "PFE" "PG"
[23] "TRV" "UNH" "UTX" "V" "VZ" "WBA" "WMT" "XOM"
# Lookup from the ticker symbol used in the stock data to the company name as
# spelled in the news data. Values must match the news data exactly, including
# the leading space in " The Travelers Companies".
company_ticker_dict <- list(
  AAPL = "Apple",
  AXP  = "American Express",
  BA   = "Boeing",
  CAT  = "Caterpillar Inc.",
  CSCO = "Cisco",
  CVX  = "Chevron Corporation",
  DD   = "Dow",
  DIS  = "The Walt Disney Company",
  GS   = "Goldman Sachs",
  HD   = "The Home Depot",
  IBM  = "IBM",
  INTC = "Intel",
  JNJ  = "Johnson & Johnson",
  JPM  = "JPMorgan Chase",
  KO   = "Coca-Cola",
  MCD  = "McDonald's",
  MMM  = "3M",
  MRK  = "Merck & co",
  MSFT = "Microsoft",
  NKE  = "Nike",
  PFE  = "Pfizer",
  PG   = "Procter & Gamble",
  TRV  = " The Travelers Companies",
  UNH  = "United Health Group",
  UTX  = "United Technologies",
  V    = "Visa Inc.",
  VZ   = "Verizon",
  WBA  = "Walgreens Boots Alliance",
  WMT  = "Walmart",
  XOM  = "ExxonMobil"
)
# Translate every row's ticker into its full company name with one vectorised
# lookup, instead of growing a list element by element inside a loop.
ticker_lookup <- unlist(company_ticker_dict)
row_tickers <- as.character(stock_numbers_ave$company_name)
company_names_list <- unname(ticker_lookup[row_tickers])
# Tickers missing from `company_ticker_dict` come back as NA. Report them
# rather than letting those rows quietly fail to match in the later join on
# `company_name_full`.
unmapped_tickers <- unique(row_tickers[is.na(company_names_list)])
if (length(unmapped_tickers) > 0) {
  warning(
    "No company name for ticker(s): ",
    paste(unmapped_tickers, collapse = ", "),
    call. = FALSE
  )
}
stock_numbers_ave <- stock_numbers_ave %>%
  mutate(company_name_full = company_names_list)
head(stock_numbers_ave)
# Inspect the news rows for a single company.
news_data %>% filter(company_name == "IBM")
# Attach the news rows to each stock row by date and full company name. The
# key columns are converted to character up front: on the two sides they are
# factors with different levels (or factor vs. character), which otherwise
# makes `left_join` coerce them itself and warn.
# NOTE(review): assumes `Date` and `created_time` use the same date format --
# confirm against the CSVs.
a <- left_join(
  stock_numbers_ave %>% mutate(Date = as.character(Date)),
  news_data %>%
    mutate(
      created_time = as.character(created_time),
      company_name = as.character(company_name)
    ),
  by = c("Date" = "created_time", "company_name_full" = "company_name")
)
Column `Date`/`created_time` joining factors with different levels, coercing to character vector
Column `company_name_full`/`company_name` joining character vector and factor, coercing into character vector
# Show the joined stock/news table.
a
# Keep only the rows that have no missing value in any column.
final_df <- na.omit(a)
# Display the modelling columns, exposing `news_data_new` under the clearer
# name `sentiment_score`. The result is printed, not stored.
final_df %>%
  select(
    Date, company_name, company_name_full, High, Low, Volume,
    sentiment_score = news_data_new, target
  )
LS0tCnRpdGxlOiAiU2VudGltZW50IEFuYWx5c2VyIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpgYGB7cn0KbGlicmFyeShzZW50aW1lbnRyKQpsaWJyYXJ5KGRwbHlyKQpgYGAKCmBgYHtyfQpteXRleHQgPC0gYygKICAnZG8geW91IGxpa2UgaXQ/ICBCdXQgSSBoYXRlIHJlYWxseSBiYWQgZG9ncycsCiAgJ0kgYW0gdGhlIGJlc3QgZnJpZW5kLicsCiAgIkRvIHlvdSByZWFsbHkgbGlrZSBpdD8gIEknbSBub3QgYSBmYW4iLAogICJJdCdzIGxpa2UgYSB0cmVlLiIsCiAgIk1pY3Jvc29mdCBpcyBhbiBhbWF6aW5nIGdpcmwuIEl0IGlzIGRvaW5nIHJlYWxseSB3ZWxsIGFuZCBoYXMgbWFkZSB1cyBhbGwgcHJvdWQuIiwKICAiRmFjZWJvb2sgaXMgc3VjaCBhIHNjYW0uIEl0IHNob3VsZCBiZSB3aXBlZCBvdXQgZnJvbSB0aGUgZmFjZSBvZiB0aGlzIGVhcnRoLiIsCiAgIlRoZSBGZWRlcmFsIFJlc2VydmUgZ3JhbnRlZCB0aGUgY29tcGFueeKAmXMgcmVxdWVzdCB0byBjaGFuZ2UgaXRzIHN0YXR1cywgZ2l2aW5nIGl0IGFjY2VzcyB0byBsb3ctY29zdCBmaW5hbmNpbmcuIgopCgp0eXBlb2YobXl0ZXh0KQojIyB3b3JrcyBvbiBhIGNoYXJhY3RlciB2ZWN0b3IgYnV0IG5vdCB0aGUgcHJlZmVycmVkIG1ldGhvZCBhdm9pZGluZyB0aGUgCiMjIHJlcGVhdGVkIGNvc3Qgb2YgZG9pbmcgc2VudGVuY2UgYm91bmRhcnkgZGlzYW1iaWd1YXRpb24gZXZlcnkgdGltZSAKIyMgYHNlbnRpbWVudGAgaXMgcnVuLiAgRm9yIHNtYWxsIGJhdGNoZXMgdGhlIGxvc3MgaXMgbWluaW1hbC4KIyMgTm90IHJ1bjogCnNlbnRpbWVudF9ieShteXRleHQpCmBgYAoKYGBge3IgbG9hZCBkYXRhfQpuZXdzX2RhdGEgPC0gcmVhZC5jc3YoImNvbXBhbnlfZGF0YS5jc3YiKQpoZWFkKG5ld3NfZGF0YSkKYGBgCgpgYGB7ciBnZXRfc2VudGltZW50fQpuZXdzX2RhdGEgPSBtdXRhdGUobmV3c19kYXRhLCBuZXdzX2RhdGFfbmV3ID0gc2VudGltZW50X2J5KGFzLmNoYXJhY3RlcihoZWFkbGluZSkpJGF2ZV9zZW50aW1lbnQpCmBgYAoKYGBge3J9Cm5ld3NfZGF0YQpgYGAKCmBgYHtyfQpkIDwtIGRlbnNpdHkobmV3c19kYXRhJG5ld3NfZGF0YV9uZXcpCnBsb3QoZCwgbWFpbj0iRGlzdCIpCnBvbHlnb24oZCwgY29sPSJyZWQiLCBib3JkZXI9ImJsdWUiKQpgYGAKCmBgYHtyfQpzdG9ja19udW1iZXJzIDwtIHJlYWQuY3N2KCJjb21iaW5lZF9kYXRhZnJhbWVfZGppYS5jc3YiKQpoZWFkKHN0b2NrX251bWJlcnMpCmBgYAoKYGBge3J9CnN0b2NrX251bWJlcnNfYXZlIDwtIHN0b2NrX251bWJlcnMlPiVtdXRhdGUoYXZlcmFnZV9wcmljZSA9IChIaWdoK0xvdykvMiklPiVzZWxlY3QoRGF0ZSxjb21wYW55X25hbWUsSGlnaCxMb3csVm9sdW1lLGF2ZXJhZ2VfcHJpY2UpCnN0b2NrX251bWJlcnNfYXZlCmBgYApgYGB7cn0KIyBsaWJyYXJ5KEhtaXNjKQojIHN0b2NrX251bWJlcnNfYXZlJGxhZ2dlZCA8LSBMYWcoc3RvY2tfbnVtYmVyc19hdmUkYXZlcmFnZV9wcmljZSwgKzEpCiMg
c3RvY2tfbnVtYmVyc19hdmUKCnN0b2NrX251bWJlcnNfYXZlIDwtIHN0b2NrX251bWJlcnNfYXZlICU+JWdyb3VwX2J5KGNvbXBhbnlfbmFtZSkgJT4lbXV0YXRlKHRhcmdldCA9IGRwbHlyOjpsZWFkKGF2ZXJhZ2VfcHJpY2UsIG4gPSAxLCBkZWZhdWx0ID0gTkEpKSU+JXVuZ3JvdXAoKQpgYGAKCgoKYGBge3J9Cm5ld3NfZGF0YQpgYGAKCmBgYHtyfQpsZXZlbHMobmV3c19kYXRhJGNvbXBhbnlfbmFtZSkKYGBgCgpgYGB7cn0KbGV2ZWxzKHN0b2NrX251bWJlcnNfYXZlJGNvbXBhbnlfbmFtZSkKYGBgCgpgYGB7cn0KY29tcGFueV90aWNrZXJfZGljdCA9IGxpc3QoIkFBUEwiPSJBcHBsZSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJBWFAiPSJBbWVyaWNhbiBFeHByZXNzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkJBIj0iQm9laW5nIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkNBVCI9IkNhdGVycGlsbGFyIEluYy4iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiQ1NDTyI9IkNpc2NvIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkNWWCI9IkNoZXZyb24gQ29ycG9yYXRpb24iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiREQiPSJEb3ciLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiRElTIj0iVGhlIFdhbHQgRGlzbmV5IENvbXBhbnkiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiR1MiPSJHb2xkbWFuIFNhY2hzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIkhEIj0iVGhlIEhvbWUgRGVwb3QiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiSUJNIj0iSUJNIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIklOVEMiPSJJbnRlbCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJKTkoiPSJKb2huc29uICYgSm9obnNvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJKUE0iPSJKUE1vcmdhbiBDaGFzZSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJLTyI9IkNvY2EtQ29sYSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJNQ0QiPSJNY0RvbmFsZCdzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIk1NTSI9IjNNIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIk1SSyI9Ik1lcmNrICYgY28iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiTVNGVCI9Ik1pY3Jvc29mdCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJOS0UiPSJOaWtlIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlBGRSI9IlBmaXplciIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJQRyI9IlByb2N0ZXIgJiBHYW1ibGUiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiVFJWIj0iIFRoZSBUcmF2ZWxlcnMgQ29tcGFuaWVzIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlVOSCI9IlVuaXRlZCBIZWFsdGggR3JvdXAiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiVVRYIj0iVW5pdGVkIFRlY2hub2xvZ2ll
cyIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJWIj0iVmlzYSBJbmMuIiwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIlZaIj0iVmVyaXpvbiIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJXQkEiPSJXYWxncmVlbnMgQm9vdHMgQWxsaWFuY2UiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAiV01UIj0iV2FsbWFydCIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICJYT00iPSJFeHhvbk1vYmlsIiApCgojc3RvY2tfbnVtYmVyc19hdmUlPiVtdXRhdGUoY29tcGFueV9mdWxsX25hbWUgPSBjb21wYW55X3RpY2tlcl9kaWN0W3Bhc3RlMCgiJyIsY29tcGFueV9uYW1lLCInIildKQoKY29tcGFueV9uYW1lX2xpc3QgPSBsaXN0KCkKZm9yIChpIGluIHN0b2NrX251bWJlcnNfYXZlJGNvbXBhbnlfbmFtZSl7CiAgY29tcGFueV9uYW1lX2xpc3QgPC0gYXBwZW5kKGNvbXBhbnlfbmFtZV9saXN0LGNvbXBhbnlfdGlja2VyX2RpY3RbaV0pCn0KCgpgYGAKCmBgYHtyfQpjb21wYW55X25hbWVzX2xpc3QgPC0gc3RhY2soY29tcGFueV9uYW1lX2xpc3QpJHZhbHVlcwpgYGAKCmBgYHtyfQpzdG9ja19udW1iZXJzX2F2ZSA8LSBzdG9ja19udW1iZXJzX2F2ZSU+JW11dGF0ZShjb21wYW55X25hbWVfZnVsbCA9IGNvbXBhbnlfbmFtZXNfbGlzdCkKaGVhZChzdG9ja19udW1iZXJzX2F2ZSkKYGBgCgpgYGB7cn0KbmV3c19kYXRhJT4lZmlsdGVyKGNvbXBhbnlfbmFtZSA9PSAnSUJNJykKYGBgCgpgYGB7cn0KYSA8LSBsZWZ0X2pvaW4oc3RvY2tfbnVtYmVyc19hdmUsIG5ld3NfZGF0YSwgYnkgPSBjKCJEYXRlIiA9ICJjcmVhdGVkX3RpbWUiLCJjb21wYW55X25hbWVfZnVsbCIgPSAiY29tcGFueV9uYW1lIikpCmEKYGBgCgoKYGBge3J9CmZpbmFsX2RmIDwtIG5hLm9taXQoYSkKZmluYWxfZGYlPiVtdXRhdGUoc2VudGltZW50X3Njb3JlPW5ld3NfZGF0YV9uZXcpJT4lc2VsZWN0KERhdGUsY29tcGFueV9uYW1lLGNvbXBhbnlfbmFtZV9mdWxsLEhpZ2gsTG93LFZvbHVtZSxzZW50aW1lbnRfc2NvcmUsdGFyZ2V0KQpgYGAKCgoKYGBge3J9CgpgYGAKCgoKCgoKIyBgYGB7cn0KIyBsaWJyYXJ5KGpzb25saXRlKQojIHRlc3QgPC0gZnJvbUpTT04oImh0dHBzOi8vYXBpLmlleHRyYWRpbmcuY29tLzEuMC9yZWYtZGF0YS9zeW1ib2xzIikKIyB0ZXN0CiMgZmluYWxfc3RvY2tzIDwtIGxlZnRfam9pbihzdG9ja19udW1iZXJzX2F2ZSx0ZXN0LGJ5PWMoImNvbXBhbnlfbmFtZSI9InN5bWJvbCIpKSU+JXNlbGVjdChEYXRlLGNvbXBhbnlfbmFtZSxhdmVyYWdlX3ByaWNlLG5hbWUpCiMgZmluYWxfc3RvY2tzJG5hbWUgPC0gYXMuZmFjdG9yKGZpbmFsX3N0b2NrcyRuYW1lKQojIGxldmVscyhmaW5hbF9zdG9ja3MkbmFtZSkKIyBgYGAK